In [1]:
#Import packages
### YOUR CODE HERE ###
# For data manipulation
import numpy as np
import pandas as pd
# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# For displaying all of the columns in dataframes
pd.set_option('display.max_columns', None)
# For data modeling
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# For metrics and helpful functions
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import plot_tree
# For saving models
import pickle
In [5]:
# RUN THIS CELL TO IMPORT YOUR DATA.
# Load dataset into a dataframe.
# NOTE(review): hardcoded absolute local path — the notebook only runs on this
# machine. Prefer a project-relative data directory; the constant below at
# least makes the dependency explicit and easy to change in one place.
DATA_PATH = r"C:\Users\HP\OneDrive\Documents\Google Advanced Data Analytics\Statistics Course\HR_comma_sep.csv"
df = pd.read_csv(DATA_PATH)
# Display first few rows of the dataframe
df.head()
Out[5]:
| satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | left | promotion_last_5years | Department | salary | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | sales | low |
| 1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | sales | medium |
| 2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | sales | medium |
| 3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | sales | low |
| 4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | sales | low |
In [6]:
# Gather basic information about the data: row count, column dtypes,
# non-null counts, and memory footprint (14,999 rows, no missing values).
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 14999 entries, 0 to 14998 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 satisfaction_level 14999 non-null float64 1 last_evaluation 14999 non-null float64 2 number_project 14999 non-null int64 3 average_montly_hours 14999 non-null int64 4 time_spend_company 14999 non-null int64 5 Work_accident 14999 non-null int64 6 left 14999 non-null int64 7 promotion_last_5years 14999 non-null int64 8 Department 14999 non-null object 9 salary 14999 non-null object dtypes: float64(2), int64(6), object(2) memory usage: 1.1+ MB
In [7]:
# Gather descriptive statistics (count/mean/std/quartiles) for the
# numeric columns; object columns (Department, salary) are excluded.
df.describe()
Out[7]:
| satisfaction_level | last_evaluation | number_project | average_montly_hours | time_spend_company | Work_accident | left | promotion_last_5years | |
|---|---|---|---|---|---|---|---|---|
| count | 14999.000000 | 14999.000000 | 14999.000000 | 14999.000000 | 14999.000000 | 14999.000000 | 14999.000000 | 14999.000000 |
| mean | 0.612834 | 0.716102 | 3.803054 | 201.050337 | 3.498233 | 0.144610 | 0.238083 | 0.021268 |
| std | 0.248631 | 0.171169 | 1.232592 | 49.943099 | 1.460136 | 0.351719 | 0.425924 | 0.144281 |
| min | 0.090000 | 0.360000 | 2.000000 | 96.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.440000 | 0.560000 | 3.000000 | 156.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 0.640000 | 0.720000 | 4.000000 | 200.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 0.820000 | 0.870000 | 5.000000 | 245.000000 | 4.000000 | 0.000000 | 0.000000 | 0.000000 |
| max | 1.000000 | 1.000000 | 7.000000 | 310.000000 | 10.000000 | 1.000000 | 1.000000 | 1.000000 |
In [ ]:
In [ ]:
In [8]:
# Display all column names — note the inconsistent casing (`Work_accident`,
# `Department`) and the `average_montly_hours` typo, fixed in the next cell.
df.columns
Out[8]:
Index(['satisfaction_level', 'last_evaluation', 'number_project',
'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
'promotion_last_5years', 'Department', 'salary'],
dtype='object')
In [8]:
# Standardize column names: snake_case everywhere, fix the source typo
# in `average_montly_hours`, and shorten `time_spend_company` to `tenure`.
column_renames = {
    'Work_accident': 'work_accident',
    'average_montly_hours': 'average_monthly_hours',
    'time_spend_company': 'tenure',
    'Department': 'department',
}
df = df.rename(columns=column_renames)
# Confirm the updated column names
df.columns
Out[8]:
Index(['satisfaction_level', 'last_evaluation', 'number_project',
'average_monthly_hours', 'tenure', 'work_accident', 'left',
'promotion_last_5years', 'department', 'salary'],
dtype='object')
In [ ]:
In [10]:
# Count missing values per column (all zeros here — no imputation needed)
df.isna().sum()
Out[10]:
satisfaction_level 0 last_evaluation 0 number_project 0 average_monthly_hours 0 tenure 0 work_accident 0 left 0 promotion_last_5years 0 department 0 salary 0 dtype: int64
In [ ]:
In [11]:
# Count fully duplicated rows (3,008 — roughly 20% of the 14,999 rows)
df.duplicated().sum()
Out[11]:
3008
In [ ]:
In [12]:
# Inspect a few of the duplicated rows before deciding to drop them
df[df.duplicated()].head()
Out[12]:
| satisfaction_level | last_evaluation | number_project | average_monthly_hours | tenure | work_accident | left | promotion_last_5years | department | salary | |
|---|---|---|---|---|---|---|---|---|---|---|
| 396 | 0.46 | 0.57 | 2 | 139 | 3 | 0 | 1 | 0 | sales | low |
| 866 | 0.41 | 0.46 | 2 | 128 | 3 | 0 | 1 | 0 | accounting | low |
| 1317 | 0.37 | 0.51 | 2 | 127 | 3 | 0 | 1 | 0 | sales | medium |
| 1368 | 0.41 | 0.52 | 2 | 132 | 3 | 0 | 1 | 0 | RandD | low |
| 1461 | 0.42 | 0.53 | 2 | 142 | 3 | 0 | 1 | 0 | sales | low |
In [9]:
# Drop exact duplicate rows into a new frame, leaving `df` untouched.
# (`keep='first'` is the pandas default, so it is omitted here.)
df1 = df.drop_duplicates()
# Sanity-check the de-duplicated frame
df1.head()
Out[9]:
| satisfaction_level | last_evaluation | number_project | average_monthly_hours | tenure | work_accident | left | promotion_last_5years | department | salary | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | sales | low |
| 1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | sales | medium |
| 2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | sales | medium |
| 3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | sales | low |
| 4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | sales | low |
In [ ]:
In [15]:
# Visualize the tenure distribution with a boxplot to surface outliers.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=df1['tenure'], color='skyblue', ax=ax)
ax.set_title('Boxplot of Tenure Distribution and Outliers Detection', fontsize=14)
ax.set_xlabel('Tenure', fontsize=12)
ax.tick_params(axis='both', labelsize=12)
plt.show()
In [ ]:
In [10]:
# Flag `tenure` outliers with the standard 1.5 * IQR rule.
# (`lower_limit` / `upper_limit` are reused later to build `df_log`.)
percentile25, percentile75 = df1['tenure'].quantile([0.25, 0.75])
iqr = percentile75 - percentile25
lower_limit = percentile25 - 1.5 * iqr
upper_limit = percentile75 + 1.5 * iqr

# Report the computed bounds
print(f"25th percentile (Q1): {percentile25}")
print(f"75th percentile (Q3): {percentile75}")
print(f"IQR: {iqr}")
print(f"Lower limit for outliers: {lower_limit}")
print(f"Upper limit for outliers: {upper_limit}")

# Rows falling outside the bounds
outlier_mask = (df1['tenure'] < lower_limit) | (df1['tenure'] > upper_limit)
outliers = df1[outlier_mask]

print(f"Number of outliers in 'tenure': {len(outliers)}")
print("Outlier data preview:")
outliers.head()
25th percentile (Q1): 3.0 75th percentile (Q3): 4.0 IQR: 1.0 Lower limit for outliers: 1.5 Upper limit for outliers: 5.5 Number of outliers in 'tenure': 824 Outlier data preview:
Out[10]:
| satisfaction_level | last_evaluation | number_project | average_monthly_hours | tenure | work_accident | left | promotion_last_5years | department | salary | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | sales | medium |
| 17 | 0.78 | 0.99 | 4 | 255 | 6 | 0 | 1 | 0 | sales | low |
| 34 | 0.84 | 0.87 | 4 | 246 | 6 | 0 | 1 | 0 | hr | low |
| 47 | 0.57 | 0.70 | 3 | 273 | 6 | 0 | 1 | 0 | support | low |
| 67 | 0.90 | 0.98 | 4 | 264 | 6 | 0 | 1 | 0 | product_mng | medium |
In [ ]:
In [ ]:
In [6]:
# Get counts of people who left vs. stayed (raw counts first)
print(df1['left'].value_counts())
print()
# Get percentages of people who left vs. stayed — ~16.6% left,
# so the target is imbalanced (relevant for model choice/metrics)
print(df1['left'].value_counts(normalize=True))
left 0 10000 1 1991 Name: count, dtype: int64 left 0 0.833959 1 0.166041 Name: proportion, dtype: float64
In [ ]:
In [20]:
# Two-panel view of workload vs. project count, split by attrition status.
fig, (ax_box, ax_hist) = plt.subplots(1, 2, figsize=(22, 8))

# Left panel: horizontal boxplots of monthly hours for each project count
sns.boxplot(data=df1, x='average_monthly_hours', y='number_project', hue='left',
            orient='h', ax=ax_box, palette='Set2')
ax_box.invert_yaxis()  # smallest project counts at the top
ax_box.set_title('Average Monthly Hours by Number of Projects (Stay vs. Left)', fontsize=16)
ax_box.set_xlabel('Average Monthly Hours', fontsize=12)
ax_box.set_ylabel('Number of Projects', fontsize=12)
ax_box.legend(title='Employee Left', loc='upper right', fontsize=12)

# Right panel: dodged histogram of project counts by attrition status
sns.histplot(data=df1, x='number_project', hue='left', multiple='dodge',
             shrink=0.8, ax=ax_hist, palette='Set1', binwidth=1)
ax_hist.set_title('Distribution of Number of Projects (Stay vs. Left)', fontsize=16)
ax_hist.set_xlabel('Number of Projects', fontsize=12)
ax_hist.set_ylabel('Count', fontsize=12)

plt.tight_layout()
plt.show()
In [ ]:
In [21]:
# Get value counts of stayed/left for employees with 7 projects —
# the output shows only `left == 1` (145), i.e. every one of them left
df1[df1['number_project']==7]['left'].value_counts()
Out[21]:
left 1 145 Name: count, dtype: int64
In [ ]:
In [23]:
# Scatterplot of average monthly hours vs. satisfaction level, colored
# by whether the employee left.
plt.figure(figsize=(16, 9))
# Transparency keeps dense clusters readable
sns.scatterplot(data=df1, x='average_monthly_hours', y='satisfaction_level', hue='left', palette='viridis', alpha=0.6, s=100)
# Vertical reference line at the nominal full-time workload (166.67 hrs/mo)
plt.axvline(x=166.67, color='#ff6361', linestyle='--', linewidth=2, label='166.67 hrs./mo.')
# FIX: matplotlib pairs `labels=[...]` with handles positionally, so the old
# hardcoded list mislabeled the hue entries and the reference line. Rename
# labels by their current text instead of assuming handle order.
handles, labels = plt.gca().get_legend_handles_labels()
hue_names = {'0': 'Stayed', '1': 'Left'}
plt.legend(handles=handles, labels=[hue_names.get(l, l) for l in labels],
           title='Employee Left', fontsize=12, title_fontsize=14, loc='upper right')
# Titles and axis labels
plt.title('Average Monthly Hours vs. Satisfaction Level (Stay vs. Left)', fontsize=18, fontweight='bold')
plt.xlabel('Average Monthly Hours', fontsize=14)
plt.ylabel('Satisfaction Level', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()
In [ ]:
In [25]:
# Compare satisfaction and tenure distributions for stayers vs. leavers.
sns.set_style('whitegrid')
two_color_palette = sns.color_palette("Set2", 2)  # one color per `left` value

fig, (ax_box, ax_hist) = plt.subplots(1, 2, figsize=(22, 8))

# Left: satisfaction level by tenure, split on attrition
sns.boxplot(data=df1, x='satisfaction_level', y='tenure', hue='left',
            orient="h", ax=ax_box, palette=two_color_palette)
ax_box.invert_yaxis()  # highest tenure at the top
ax_box.set_title('Satisfaction by Tenure', fontsize=16)
ax_box.set_xlabel('Satisfaction Level', fontsize=14)
ax_box.set_ylabel('Tenure (Years)', fontsize=14)
ax_box.tick_params(axis='both', which='major', labelsize=12)
ax_box.grid(True)

# Right: tenure histogram, split on attrition
sns.histplot(data=df1, x='tenure', hue='left', multiple='dodge', bins=10,
             palette=two_color_palette, ax=ax_hist, alpha=0.8)
ax_hist.set_title('Tenure Histogram', fontsize=16)
ax_hist.set_xlabel('Tenure (Years)', fontsize=14)
ax_hist.set_ylabel('Count', fontsize=14)
ax_hist.tick_params(axis='both', which='major', labelsize=12)
ax_hist.grid(True)

plt.tight_layout()
plt.show()
In [ ]:
In [7]:
# Calculate mean and median satisfaction scores of employees who left
# and those who stayed — leavers score markedly lower (≈0.44 vs ≈0.67 mean)
df1.groupby(['left'])['satisfaction_level'].agg(['mean', 'median'])
Out[7]:
| mean | median | |
|---|---|---|
| left | ||
| 0 | 0.667365 | 0.69 |
| 1 | 0.440271 | 0.41 |
In [ ]:
In [28]:
# Set figure and axes
fig, ax = plt.subplots(1, 2, figsize=(22, 8))
# Define short-tenured employees
tenure_short = df1[df1['tenure'] < 7]
# Define long-tenured employees
tenure_long = df1[df1['tenure'] >= 7] # Use `>=` for better clarity and inclusive grouping
# Plot short-tenured histogram
sns.histplot(data=tenure_short, x='tenure', hue='salary', discrete=True,
hue_order=['low', 'medium', 'high'], multiple='dodge', shrink=0.6,
palette='Set2', ax=ax[0])
# Improve axis labels, title, and grid for the first plot
ax[0].set_title('Salary Distribution by Tenure: Short-Tenured Employees', fontsize=16)
ax[0].set_xlabel('Tenure (Years)', fontsize=14)
ax[0].set_ylabel('Count', fontsize=14)
ax[0].tick_params(axis='both', which='major', labelsize=12)
ax[0].grid(True)
# Plot long-tenured histogram
sns.histplot(data=tenure_long, x='tenure', hue='salary', discrete=True,
hue_order=['low', 'medium', 'high'], multiple='dodge', shrink=0.6,
palette='Set1', ax=ax[1])
# Improve axis labels, title, and grid for the second plot
ax[1].set_title('Salary Distribution by Tenure: Long-Tenured Employees', fontsize=16)
ax[1].set_xlabel('Tenure (Years)', fontsize=14)
ax[1].set_ylabel('Count', fontsize=14)
ax[1].tick_params(axis='both', which='major', labelsize=12)
ax[1].grid(True)
# Adjust layout for better spacing
plt.tight_layout()
# Display the plots
plt.show()
In [ ]:
In [30]:
# Set the figure size and axes
plt.figure(figsize=(16, 6))
# Create scatterplot to examine the relationship between average monthly hours and promotion in the last 5 years
sns.scatterplot(data=df1, x='average_monthly_hours', y='promotion_last_5years', hue='left',
palette='coolwarm', alpha=0.6, s=100)
# Add a vertical line representing the average monthly hours (166.67)
plt.axvline(x=166.67, color='#ff6361', linestyle='--', linewidth=2, label='166.67 hrs./mo.')
# Update the legend with more descriptive labels
plt.legend(title='Employee Status', labels=['Average Monthly Hours (166.67)', 'Left', 'Stayed'],
fontsize=12, title_fontsize=14, loc='upper right')
# Improve title and axis labels
plt.title('Relationship between Monthly Hours and Promotion in Last 5 Years', fontsize=18, fontweight='bold')
plt.xlabel('Average Monthly Hours', fontsize=14)
plt.ylabel('Promotion in Last 5 Years', fontsize=14)
# Adjust tick label sizes for better readability
plt.xticks(fontsize=12)
plt.yticks([0, 1], labels=['No Promotion', 'Promoted'], fontsize=12) # Customize y-axis labels
# Display the plot with optimized layout
plt.tight_layout()
plt.show()
In [ ]:
In [31]:
# Scatterplot: monthly hours vs. last evaluation score, colored by attrition.
sns.set_style('whitegrid')
palette = sns.color_palette("coolwarm", 2)  # one color per `left` value
plt.figure(figsize=(16, 9))
sns.scatterplot(data=df1, x='average_monthly_hours', y='last_evaluation', hue='left', alpha=0.6, palette=palette)
# Vertical reference line at 166.67 hours per month
plt.axvline(x=166.67, color='#ff6361', label='166.67 hrs./mo.', ls='--', linewidth=2)
plt.title('Average Monthly Hours vs Last Evaluation Score', fontsize=18)
plt.xlabel('Average Monthly Hours', fontsize=14)
plt.ylabel('Last Evaluation Score', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# FIX: the old `plt.legend(labels=[...])` paired labels with handles
# positionally, mislabeling the hue entries and the reference line.
# Rename labels by their current text instead of assuming handle order.
handles, labels = plt.gca().get_legend_handles_labels()
hue_names = {'0': 'Stayed', '1': 'Left'}
plt.legend(handles=handles, labels=[hue_names.get(l, l) for l in labels],
           title='Legend', fontsize=12, title_fontsize=14)
plt.grid(True)
plt.show()
In [ ]:
In [32]:
# Employee counts per department — sales is largest (3,239), management smallest (436)
df1["department"].value_counts()
Out[32]:
department sales 3239 technical 2244 support 1821 IT 976 RandD 694 product_mng 686 marketing 673 accounting 621 hr 601 management 436 Name: count, dtype: int64
In [ ]:
In [37]:
# Set the plot style and color palette
sns.set_style('whitegrid')
# Create a figure for the stacked histogram
plt.figure(figsize=(11, 8))
# Create a palette with exactly two colors for the 'left' hue
palette = sns.color_palette("Set2", n_colors=2)
# Create a histogram showing department-wise distribution of employees who stayed vs left
sns.histplot(data=df1, x='department', hue='left', discrete=True,
hue_order=[0, 1], multiple='dodge', shrink=0.8, palette=palette)
# Rotate and align x-axis labels for better readability
plt.xticks(rotation=45, ha='right', fontsize=12)
# Set plot title and axis labels
plt.title('Department-wise Distribution of Employees: Stayed vs Left', fontsize=16)
plt.xlabel('Department', fontsize=14)
plt.ylabel('Count', fontsize=14)
# Adjust tick size for better readability
plt.yticks(fontsize=12)
# Add gridlines for visual clarity
plt.grid(True, axis='y', linestyle='--')
# Improve the legend
plt.legend(title='Status', labels=['Stayed', 'Left'], fontsize=12, title_fontsize=14)
# Adjust layout for better spacing
plt.tight_layout()
# Show the plot
plt.show()
In [ ]:
In [11]:
# Build the model-ready frame: ordinal-encode salary, one-hot the department.
df_enc = df1.copy()

# `salary` is ordinal (low < medium < high) → integer codes 0/1/2;
# any value outside the three categories would encode as -1.
salary_dtype = pd.CategoricalDtype(categories=['low', 'medium', 'high'], ordered=True)
df_enc['salary'] = df_enc['salary'].astype(salary_dtype).cat.codes

# `department` is nominal → dummy columns, dropping the first level
# to avoid perfect collinearity.
df_enc = pd.get_dummies(df_enc, columns=['department'], drop_first=True, prefix='dept')

# Preview the encoded frame
df_enc.head()
Out[11]:
| satisfaction_level | last_evaluation | number_project | average_monthly_hours | tenure | work_accident | left | promotion_last_5years | salary | dept_RandD | dept_accounting | dept_hr | dept_management | dept_marketing | dept_product_mng | dept_sales | dept_support | dept_technical | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | True | False | False |
| 1 | 0.80 | 0.86 | 5 | 262 | 6 | 0 | 1 | 0 | 1 | False | False | False | False | False | False | True | False | False |
| 2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | 1 | False | False | False | False | False | False | True | False | False |
| 3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | True | False | False |
| 4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | True | False | False |
In [ ]:
In [40]:
# Correlation heatmap of the main numeric drivers.
corr_columns = ['satisfaction_level', 'last_evaluation', 'number_project',
                'average_monthly_hours', 'tenure']
corr_matrix = df_enc[corr_columns].corr()

plt.figure(figsize=(10, 8))
heatmap = sns.heatmap(corr_matrix, annot=True, fmt=".2f", linewidths=0.5,
                      cmap="crest", cbar_kws={'shrink': 0.75})

# Enlarge colorbar ticks and style the title/axis labels
heatmap.collections[0].colorbar.ax.tick_params(labelsize=12)
heatmap.set_title('Correlation Heatmap of Key Variables', fontsize=18, fontweight='bold', pad=20)
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=45, fontsize=12)
heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation=0, fontsize=12)

plt.tight_layout()
plt.show()
In [ ]:
In [41]:
# Bar chart of stayed vs. left counts per department.
crosstab = pd.crosstab(df1['department'], df1['left'])
# FIX: the old code called plt.figure(figsize=...) and then crosstab.plot(),
# but DataFrame.plot creates its own new figure — leaving the empty
# "<Figure size 1200x800 with 0 Axes>" seen in the output. Pass figsize
# to the pandas plot call so only one figure is created.
ax = crosstab.plot(kind='bar', figsize=(12, 8), color=['purple', 'red'],
                   edgecolor='black', alpha=0.8)
ax.set_title('Employee Count by Department: Stayed vs. Left', fontsize=16, fontweight='bold')
ax.set_ylabel('Employee Count', fontsize=14)
ax.set_xlabel('Department', fontsize=14)
# Crosstab column order is [0, 1], so these labels pair correctly
ax.legend(title='Employee Status', labels=['Stayed (0)', 'Left (1)'], fontsize=12, title_fontsize=14)
ax.tick_params(axis='both', labelsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
<Figure size 1200x800 with 0 Axes>
In [ ]:
In [ ]:
In [12]:
# Keep only rows whose tenure lies within the non-outlier bounds computed
# earlier (`lower_limit` / `upper_limit` from the 1.5 * IQR rule).
# `Series.between` is inclusive on both ends, matching >= / <=.
df_log = df_enc[df_enc['tenure'].between(lower_limit, upper_limit)]
# Preview the filtered modeling frame
df_log.head()
Out[12]:
| satisfaction_level | last_evaluation | number_project | average_monthly_hours | tenure | work_accident | left | promotion_last_5years | salary | dept_RandD | dept_accounting | dept_hr | dept_management | dept_marketing | dept_product_mng | dept_sales | dept_support | dept_technical | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | True | False | False |
| 2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 1 | 0 | 1 | False | False | False | False | False | False | True | False | False |
| 3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | True | False | False |
| 4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | True | False | False |
| 5 | 0.41 | 0.50 | 2 | 153 | 3 | 0 | 1 | 0 | 0 | False | False | False | False | False | False | True | False | False |
In [ ]:
In [10]:
# Isolate the outcome variable (1 = employee left, 0 = stayed)
y = df_log['left']
# Display first few rows of the outcome variable
y.head()
Out[10]:
0 1 2 1 3 1 4 1 5 1 Name: left, dtype: int64
In [ ]:
In [26]:
# Class balance after outlier removal: 9,285 stayed vs. 1,882 left (~17%)
df_log['left'].value_counts()
left
0 9285
1 1882
Name: count, dtype: int64
Out[26]:
left 0 9285 1 1882 Name: count, dtype: int64
In [11]:
# Feature matrix: every column except the `left` target
X = df_log.drop(columns='left')
# Preview the selected features
X.head()
Out[11]:
| satisfaction_level | last_evaluation | number_project | average_monthly_hours | tenure | work_accident | promotion_last_5years | salary | dept_RandD | dept_accounting | dept_hr | dept_management | dept_marketing | dept_product_mng | dept_sales | dept_support | dept_technical | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.38 | 0.53 | 2 | 157 | 3 | 0 | 0 | 0 | False | False | False | False | False | False | True | False | False |
| 2 | 0.11 | 0.88 | 7 | 272 | 4 | 0 | 0 | 1 | False | False | False | False | False | False | True | False | False |
| 3 | 0.72 | 0.87 | 5 | 223 | 5 | 0 | 0 | 0 | False | False | False | False | False | False | True | False | False |
| 4 | 0.37 | 0.52 | 2 | 159 | 3 | 0 | 0 | 0 | False | False | False | False | False | False | True | False | False |
| 5 | 0.41 | 0.50 | 2 | 153 | 3 | 0 | 0 | 0 | False | False | False | False | False | False | True | False | False |
In [13]:
Out[13]:
Index(['satisfaction_level', 'last_evaluation', 'number_project',
'average_monthly_hours', 'tenure', 'work_accident',
'promotion_last_5years', 'salary', 'dept_RandD', 'dept_accounting',
'dept_hr', 'dept_management', 'dept_marketing', 'dept_product_mng',
'dept_sales', 'dept_support', 'dept_technical'],
dtype='object')
In [47]:
# Split the data into training set and testing set.
# 75/25 split, stratified on `y` so both sets keep the ~17% leave rate;
# fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
In [ ]:
In [ ]:
In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Scale the features for better convergence
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Construct a logistic regression model with improved parameters
log_clf = LogisticRegression(random_state=42, max_iter=1000, solver='lbfgs', class_weight='balanced') # Adjusted solver and class weight
log_clf.fit(X_train_scaled, y_train)
# Use the logistic regression model to get predictions on the scaled test set
y_pred = log_clf.predict(X_test_scaled)
# Compute values for confusion matrix
log_cm = confusion_matrix(y_test, y_pred, labels=log_clf.classes_)
# Create a display of the confusion matrix
log_disp = ConfusionMatrixDisplay(confusion_matrix=log_cm,
display_labels=log_clf.classes_)
In [51]:
# Side-by-side comparison of actual vs. predicted labels on the test set.
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# Boolean column: did the model get this row right?
results_df['Correct'] = results_df['Actual'].eq(results_df['Predicted'])
# Preview the first few comparisons
results_df.head()
Out[51]:
| Actual | Predicted | Correct | |
|---|---|---|---|
| 10368 | 0 | 0 | True |
| 6408 | 0 | 0 | True |
| 6129 | 0 | 0 | True |
| 964 | 1 | 0 | False |
| 11657 | 0 | 0 | True |
In [50]:
# Plot the confusion matrix with enhancements
plt.figure(figsize=(10, 8))
log_disp.plot(cmap=plt.cm.Blues, values_format='d', ax=plt.gca(), colorbar=True)
# Set titles and labels for clarity
plt.title('Confusion Matrix for Logistic Regression Model', fontsize=20, fontweight='bold')
plt.xlabel('Predicted Label', fontsize=16)
plt.ylabel('True Label', fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# Add a grid for better visual reference
plt.grid(False)
# Display metrics on the confusion matrix
for i in range(len(log_cm)):
for j in range(len(log_cm)):
plt.text(j, i, f'{log_cm[i, j]}', ha='center', va='center', color='white' if log_cm[i, j] > log_cm.max()/2 else 'black', fontsize=14)
# Show the plot
plt.tight_layout()
plt.show()
In [ ]:
In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
classification_report,
accuracy_score,
confusion_matrix,
roc_curve,
auc,
precision_recall_curve,
average_precision_score,
)
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Create a logistic regression model and fit it
log_clf = LogisticRegression(random_state=42, max_iter=5000).fit(X_train, y_train)
# Use the model to get predictions on the test set
y_pred = log_clf.predict(X_test)
y_proba = log_clf.predict_proba(X_test)[:, 1] # Get predicted probabilities for the positive class
# Class distribution
class_distribution = df_log['left'].value_counts(normalize=True)
print("Class Distribution:")
print(class_distribution)
# Create classification report
target_names = ['Predicted would not leave', 'Predicted would leave']
class_report = classification_report(y_test, y_pred, target_names=target_names)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Logistic Regression Model: {accuracy:.2f}\n")
# Display classification report
print("Classification Report:")
print(class_report)
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
Class Distribution:
left
0 0.831468
1 0.168532
Name: proportion, dtype: float64
Accuracy of the Logistic Regression Model: 0.82
Classification Report:
precision recall f1-score support
Predicted would not leave 0.86 0.93 0.90 2321
Predicted would leave 0.44 0.26 0.33 471
accuracy 0.82 2792
macro avg 0.65 0.60 0.61 2792
weighted avg 0.79 0.82 0.80 2792
In [55]:
# ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_proba)
average_precision = average_precision_score(y_test, y_proba)
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='blue', lw=2, label=f'Precision-Recall Curve (AP = {average_precision:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.show()
In [56]:
# 5-fold cross-validated accuracy on the training set.
# NOTE(review): accuracy is optimistic with ~83% majority class; consider
# scoring='roc_auc' or 'f1' alongside it.
cv_scores = cross_val_score(log_clf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validated Accuracy: {np.mean(cv_scores):.2f} +/- {np.std(cv_scores):.2f}")
Cross-Validated Accuracy: 0.83 +/- 0.00
In [57]:
# Rank features by the absolute size of their logistic-regression coefficients.
# NOTE(review): this model was fit on unscaled features, so coefficient
# magnitudes mix units — treat the ranking as rough, not definitive.
coef_magnitudes = np.abs(log_clf.coef_[0])
feature_importance = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': coef_magnitudes})
    .sort_values(by='Importance', ascending=False)
)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance from Logistic Regression')
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [14]:
# import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Train a random forest on a fixed subset of 7 features.
selected_features = ['satisfaction_level', 'last_evaluation', 'number_project',
                     'average_monthly_hours', 'tenure', 'salary', 'work_accident']
# NOTE(review): this rebinds X and y, shadowing the 17-feature X/y used by
# the logistic regression above — intentional, but easy to miss on re-run.
X = df_log[selected_features]
y = df_log['left']

# 80/20 train/test split (consider stratify=y to preserve the class balance)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# FIX: the previous version passed no random_state, so the fitted forest
# (and every downstream metric) changed on each run. The long list of
# explicitly-spelled-out default parameters is dropped — it changed nothing.
rf = RandomForestClassifier(n_estimators=1000, class_weight='balanced',
                            random_state=42)
rf.fit(X_train, y_train)
Out[14]:
RandomForestClassifier(class_weight='balanced', n_estimators=1000)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(class_weight='balanced', n_estimators=1000)
In [21]:
from pathlib import Path

# Persist the train/test splits so the exact modeling data can be reused.
# NOTE(review): hardcoded absolute local path — prefer a project-relative
# output directory so the notebook is portable across machines.
output_path = Path("C:/Users/HP/OneDrive/Documents/")

# One loop instead of four copy-pasted calls; Path handles the separators.
splits = {
    "X_train.csv": X_train,
    "X_test.csv": X_test,
    "y_train.csv": y_train,
    "y_test.csv": y_test,
}
for filename, split in splits.items():
    split.to_csv(output_path / filename, index=False)

print("Files have been saved successfully!")
Files have been saved successfully!
In [16]:
# Confirm which columns X currently holds.
# NOTE(review): the saved output shows 17 columns, but the RF cell above
# redefined X to 7 features — execution counts (In[16] vs In[14]) suggest
# this output is stale / out-of-order; rerun top-to-bottom to confirm.
X.columns
Out[16]:
Index(['satisfaction_level', 'last_evaluation', 'number_project',
'average_monthly_hours', 'tenure', 'work_accident',
'promotion_last_5years', 'salary', 'dept_RandD', 'dept_accounting',
'dept_hr', 'dept_management', 'dept_marketing', 'dept_product_mng',
'dept_sales', 'dept_support', 'dept_technical'],
dtype='object')
In [16]:
# Score the random forest on the held-out test set.
y_pred = rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {test_accuracy:.4f}")
# Per-class precision / recall / F1 breakdown
print(classification_report(y_test, y_pred))
Test set accuracy: 0.9839
precision recall f1-score support
0 0.98 1.00 0.99 1846
1 0.99 0.92 0.95 388
accuracy 0.98 2234
macro avg 0.98 0.96 0.97 2234
weighted avg 0.98 0.98 0.98 2234
In [23]:
# Side-by-side comparison of actual vs. predicted labels for the forest.
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# Boolean column: did the model get this row right?
results_df['Correct'] = results_df['Actual'].eq(results_df['Predicted'])
# Preview the first ten comparisons
results_df.head(10)
Out[23]:
| Actual | Predicted | Correct | |
|---|---|---|---|
| 3830 | 0 | 0 | True |
| 7180 | 0 | 0 | True |
| 988 | 1 | 1 | True |
| 157 | 1 | 1 | True |
| 11854 | 0 | 0 | True |
| 7446 | 0 | 0 | True |
| 3443 | 0 | 0 | True |
| 4895 | 0 | 0 | True |
| 7057 | 0 | 0 | True |
| 1594 | 1 | 1 | True |
In [15]:
from sklearn.inspection import permutation_importance

# Permutation importance: mean accuracy drop on the test set when each
# feature is shuffled (10 shuffles per feature, fixed seed for repeatability).
perm_importance = permutation_importance(
    rf, X_test, y_test, n_repeats=10, random_state=42
)

# Rank features by mean importance and keep the seven strongest.
perm_df = (
    pd.DataFrame(
        {"Feature": X_test.columns,
         "Importance": perm_importance.importances_mean}
    )
    .sort_values(by="Importance", ascending=False)
    .head(7)
)

# Horizontal bar chart of the top-7 features, explicit axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x="Importance", y="Feature", data=perm_df,
    hue="Feature", dodge=False, palette="magma", legend=False, ax=ax,
)
ax.set_title("Top 7 Permutation Feature Importance", fontsize=16)
ax.set_xlabel("Mean Decrease in Accuracy", fontsize=14)
ax.set_ylabel("Feature", fontsize=14)
ax.tick_params(axis="both", labelsize=12)
fig.tight_layout()
plt.show()
In [27]:
df_log.info()
<class 'pandas.core.frame.DataFrame'> Index: 11167 entries, 0 to 11999 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 satisfaction_level 11167 non-null float64 1 last_evaluation 11167 non-null float64 2 number_project 11167 non-null int64 3 average_monthly_hours 11167 non-null int64 4 tenure 11167 non-null int64 5 work_accident 11167 non-null int64 6 left 11167 non-null int64 7 promotion_last_5years 11167 non-null int64 8 salary 11167 non-null int8 9 dept_RandD 11167 non-null bool 10 dept_accounting 11167 non-null bool 11 dept_hr 11167 non-null bool 12 dept_management 11167 non-null bool 13 dept_marketing 11167 non-null bool 14 dept_product_mng 11167 non-null bool 15 dept_sales 11167 non-null bool 16 dept_support 11167 non-null bool 17 dept_technical 11167 non-null bool dtypes: bool(9), float64(2), int64(6), int8(1) memory usage: 894.2 KB
In [ ]:
In [20]:
import joblib

# Artifact path for the serialized random forest
model_file = 'random_forest_model.joblib'

# Persist the fitted model to disk
joblib.dump(rf, model_file)

# Re-load it immediately to confirm the artifact round-trips
loaded_model = joblib.load(model_file)
In [18]:
import joblib

# Load the previously saved model from disk.
# Fix: the artifact was saved as 'random_forest_model.joblib'; loading
# 'random_forest_model' (no extension) raised FileNotFoundError.
loaded_model = joblib.load('random_forest_model.joblib')
In [ ]:
# Feature columns the random forest was trained on, in training order
# (matches the X.columns output earlier in the notebook).
FEATURE_COLUMNS = [
    'satisfaction_level', 'last_evaluation', 'number_project',
    'average_monthly_hours', 'tenure', 'work_accident',
    'promotion_last_5years', 'salary', 'dept_RandD', 'dept_accounting',
    'dept_hr', 'dept_management', 'dept_marketing', 'dept_product_mng',
    'dept_sales', 'dept_support', 'dept_technical',
]

# Function to take user input
def get_user_input():
    """Prompt for employee attributes and build a single-row feature frame.

    Returns
    -------
    pandas.DataFrame
        One row whose columns exactly match FEATURE_COLUMNS, ready to pass
        straight to the fitted model's predict(). Department dummies are 0
        unless the user names one of the encoded departments (the absent
        dummy — presumably 'IT' — is the drop-first baseline; TODO confirm
        against the encoding cell).
    """
    print("Please enter the following details about the employee:")
    satisfaction_level = float(input("Satisfaction level (0-1): "))
    last_evaluation = float(input("Last evaluation (0-1): "))
    number_project = int(input("Number of projects: "))
    average_monthly_hours = int(input("Average monthly hours: "))
    tenure = int(input("Tenure (in years): "))
    work_accident = int(input("Had work accident? (0 = No, 1 = Yes): "))
    promotion = int(input("Promoted in last 5 years? (0 = No, 1 = Yes): "))
    salary = int(input("Salary level (0 = low, 1 = medium, 2 = high): "))
    department = input(
        "Department (RandD, accounting, hr, management, marketing, "
        "product_mng, sales, support, technical; anything else = baseline): "
    ).strip()

    # Start from an all-zero row and fill in the answers. Bug fix: the
    # previous version passed only 7 features, but the model expects all 17
    # training columns, so predict() failed on a feature-count mismatch.
    row = dict.fromkeys(FEATURE_COLUMNS, 0)
    row.update({
        'satisfaction_level': satisfaction_level,
        'last_evaluation': last_evaluation,
        'number_project': number_project,
        'average_monthly_hours': average_monthly_hours,
        'tenure': tenure,
        'work_accident': work_accident,
        'promotion_last_5years': promotion,
        'salary': salary,
    })

    # Flip on the matching one-hot department dummy, if the model has one.
    dept_col = f'dept_{department}'
    if dept_col in row:
        row[dept_col] = 1

    # A DataFrame (not a bare ndarray) keeps feature names aligned with the
    # columns the model saw during fit.
    return pd.DataFrame([row], columns=FEATURE_COLUMNS)

# Get user input
new_data = get_user_input()
# Make a prediction
prediction = loaded_model.predict(new_data)
# Output the prediction
if prediction[0] == 1:
    print("Prediction: The employee is likely to leave.")
else:
    print("Prediction: The employee is likely to stay.")
Please enter the following details about the employee:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: